Abstract
When using datasets gathered from mediums designed for casual conversation, such as Twitter, a recurring problem is the large amount of sarcasm present in the data. Because sarcasm is difficult to detect for both humans and Natural Language Processing models, it can hinder a model’s accuracy. As a result, we hoped to create an NLP model specifically designed to detect sarcasm for other NLP models. To do this, two datasets will be utilized: ISarcasm and Tweets with Sarcasm and Irony. In addition, these datasets will be trained on with five different models: Recurrent Neural Networks (RNN), Support Vector Machine (SVM), Random Forests, Decision Trees, and XGBoost. By combining our model with other NLP models, we hope to increase the accuracy of these models.
Dataset
The dataset, Tweets with Sarcasm and Irony, contains one column of random tweets from the social media platform Twitter, categorized into 4 classes: irony, sarcasm, figurative (both irony and sarcasm), and regular. The dataset consists of 2 files: train.csv and test.csv. Both files share the same columns and classes with different values. The tweets were collected with the tweepy package, a wrapper around the Twitter API, using their respective tweet IDs. The emotional tone of social media posts and comments is hard to interpret, so with this dataset we plan to turn the raw text into insightful information by exploring the patterns we can gather from it.
Using this dataset, we create two versions: tweets with hashtags and tweets without hashtags.
This is done because hashtags cause the model to overfit to the dataset, as seen in our results.
The other dataset we use is ISarcasm. ISarcasm is a dataset designed specifically for training a model without the bias found in many sarcasm datasets.
Links: ISarcasm: https://paperswithcode.com/dataset/isarcasm, Tweets with Sarcasm and Irony: https://www.kaggle.com/datasets/nikhiljohnk/tweets-with-sarcasm-and-irony
# Load the Tweets with Sarcasm and Irony dataset (train/test splits).
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
# Training split: two character columns, `tweets` and `class`.
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Test split: same two columns as the training split.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))
#train$tweets <- map(train$tweets, .f = function(x){
# str_squish(x)
#}) %>% unlist()
# Distinct class labels present in the training data.
classes <- train$class %>% unique()
# Observation counts, reported below.
num_obs_train <- nrow(train)
num_obs_test <- nrow(test)
Classes: figurative, irony, regular, sarcasm
Number of observations in Train: 81408
Number of observations in Test: 8128
# Per-class observation counts.
t <- train %>% group_by(class) %>% count()
t
# Sanity check: look for empty or missing tweet text.
train %>% filter(tweets == "" | tweets == " " | is.na(tweets))
# Approximate tweet length in words (space count + 1).
tweet_lengths <- train$tweets %>% map(
.f = function(x){
str_count(x, pattern = " ") + 1
}) %>% unlist()
# Attach lengths and row ids so lengths can be inspected per tweet.
ids <- 1:nrow(train)
train_temp <- train %>% mutate(tweet_length = tweet_lengths,
id = ids)
# Distribution of tweet lengths.
train_temp %>% ggplot(aes(x = tweet_lengths)) +
geom_bar(aes(fill = after_stat(count)))
# Overall tweet-length spread.
train_temp %>% ggplot(aes(y = tweet_lengths)) +
geom_boxplot()
*Tweet length range visualized with a boxplot for each class*
# Tweet-length spread broken out by class.
train_temp %>% ggplot(aes(x = class, y = tweet_lengths)) +
geom_boxplot()
Max tweet length: 67
Min tweet length: 1
Mean tweet length: 15.1798595
# Inspect the longest tweet(s).
t <- train_temp %>% filter(tweet_lengths == max(tweet_lengths))
t
# Inspect the shortest tweet(s).
t <- train_temp %>% filter(tweet_lengths == min(tweet_lengths))
t
#------------------------------------------------------------------------------
#Function just so I don't lose my mind waiting for a loop to finish
#P: Makes sure function does not print the same percentage: initialize p = 0
#outside the loop
#Length: How long the loop is
#i: the iterator
# Report loop progress at 10% milestones.
#
# i:      current iteration index.
# length: total number of iterations.
# p:      last percentage printed (initialize to 0 before the loop).
# Returns the updated last-printed percentage; feed it back in on the next
# iteration so the same milestone is never printed twice.
print_percent <- function(i, length, p) {
  pct <- floor(i / length * 100)
  if (pct %% 10 == 0 && pct != p) {
    print(paste0(pct, "% Complete"))
    p <- pct
  }
  p
}
#------------------------------------------------------------------------------
#Seperates hashtags from text
#Takes in a column of text and returns a list of hash tags
#Extract the hashtags from a vector of tweet texts.
#
#text: character vector of tweets.
#Returns a list with one element per tweet; each element is a list of the
#"#"-prefixed tokens found in that tweet (empty list when there are none).
#Progress is printed at 10% milestones via print_percent().
get_hashtags_df <- function(text) {
  tweets_separated <- text %>% str_split(pattern = " ")
  n <- length(tweets_separated)
  #Preallocate the result: the old code grew `y` with append() on every
  #iteration, which is O(n^2) over 80k tweets.
  y <- vector("list", n)
  p <- 0
  for (i in seq_along(tweets_separated)) {  #seq_along is safe for empty input
    words <- tweets_separated[[i]]
    hashtags <- list()
    #Iterate the words directly; the old `1:length(...)` broke on
    #zero-length word vectors.
    for (word in words) {
      if (grepl(word, pattern = "#.*")) {
        hashtags <- append(hashtags, word)
      }
    }
    y[[i]] <- hashtags
    p <- print_percent(i, length = n, p = p)
  }
  y
}
# Reload fresh copies of the raw train/test splits for the hashtag analysis.
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))
# NOTE(review): freq_figurative / freq_irony / freq_sarcasm / freq_regular are
# word-frequency tables built elsewhere in the document — confirm they exist
# before this chunk runs. Also note `stringsAsFactors` is not an argument of
# subset() and is silently ignored here.
# Words appearing more than 851 times in figurative tweets, most frequent first.
outlierSubsetFigurative <- subset(freq_figurative, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetFigurative %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same high-frequency cut for ironic tweets.
outlierSubsetIrony <- subset(freq_irony, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetIrony %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same cut for sarcastic tweets.
outlierSubsetSarcasm <- subset(freq_sarcasm, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetSarcasm %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same cut for regular tweets.
outlierSubsetRegular <- subset(freq_regular, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetRegular %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Join the per-class word-frequency tables into one wide table keyed by word.
# NOTE(review): the renames below depend on dplyr's join suffixes producing
# Freq.x / Freq.y / Freq.x.x / Freq.y.y in this exact order — confirm this
# still holds if dplyr is upgraded.
frequencys <- full_join(freq_figurative,freq_irony, by = "Var1") %>%
full_join(freq_regular, by = "Var1") %>%
full_join(freq_sarcasm, by = "Var1") %>%
rename(figurative = Freq.x,
irony = Freq.y,
regular = Freq.x.x,
sarcasm = Freq.y.y)
# Two-class version: regular vs. everything else.
frequencys_2_class <- full_join(freq_regular,freq_not_regular, by = "Var1") %>%
rename(regular = Freq.x,
not_regular = Freq.y)
# Replace zero/missing counts with 1 so the ratios below never divide by 0.
frequencys_2_class[frequencys_2_class == 0] <- 1
frequencys_2_class[is.na(frequencys_2_class)] <- 1
frequencys[frequencys == 0] <- 1
frequencys[is.na(frequencys)] <- 1
# Per-word score of how over-represented a word is in one class relative to
# the product of its counts in the other three classes.
frequencys <- frequencys %>%
mutate(figurative_prop = figurative/(irony * regular * sarcasm)) %>%
mutate(irony_prop = irony/(figurative * regular * sarcasm)) %>%
mutate(sarcasm_prop = sarcasm/(figurative * regular * irony)) %>%
mutate(regular_prop = regular/(figurative * sarcasm * irony))
# Two-class ratio and its inverse.
frequencys_2_class <- frequencys_2_class %>%
mutate(prop = regular/not_regular) %>%
mutate(inv_prop = not_regular/regular)
# Number of top-scoring words to display in each plot.
max = 60
#graph_freq <- function(df, max_entries = 60) {
# Top words most over-represented in the regular class.
frequencys %>%
arrange(desc(regular_prop)) %>%
slice(1:max) %>%
ggplot(aes(y = regular_prop, x = reorder(Var1, order(regular_prop, decreasing = TRUE)))) +
geom_bar(stat='identity') +
ggtitle("regular Outliers") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
#}
# Top words most over-represented in the non-regular classes.
frequencys_2_class %>%
arrange(desc(prop)) %>%
slice(1:max) %>%
ggplot(aes(y = prop, x = reorder(Var1, order(prop, decreasing = TRUE)))) +
geom_bar(stat='identity') +
ggtitle("Not regular Outliers") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
max = 60
# Top-60 tables feeding the two word clouds below.
frequencys_tmp_prop <- frequencys_2_class %>%
arrange(desc(prop)) %>%
slice(1:max)
frequencys_tmp_inv_prop <- frequencys_2_class %>%
arrange(desc(inv_prop)) %>%
slice(1:max)
# Word cloud: words characteristic of regular tweets (size = ratio).
ggplot(frequencys_tmp_prop, aes(label = Var1, size = prop)) +
geom_text_wordcloud_area(eccentricity = .54, color = 'red') +
#scale_size_area(max_size = 30) +
theme_minimal() +
ggtitle ('Words found in tweets that are considered regular')
# Word cloud: words characteristic of non-regular tweets.
ggplot(frequencys_tmp_inv_prop, aes(label = Var1, size = inv_prop)) +
geom_text_wordcloud_area(eccentricity = .54, color = 'blue') +
#scale_size_area(max_size = 30) +
theme_minimal() +
ggtitle ('Words found in tweets that are not considered regular (irony, figurative, sarcasm)')
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=12' are unlikely values in pixels
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=16' are unlikely values in pixels
#Clean tweet text ahead of modelling.
#
#data: data frame with a character `tweets` column.
#Removes @-mentions, lowercases, then strips punctuation and English
#stopwords. Returns the data frame with `tweets` cleaned in place.
preprocessing <- function(data) {
  require('tm')
  #Bug fix: the old pattern "@.* | @.*$" was greedy and deleted everything
  #from the first "@" to the last space in the tweet (and sub() only removed
  #one match). Remove each mention token individually instead.
  data$tweets <- gsub("@\\S+", "", data$tweets)
  data$tweets <- tolower(data$tweets)
  data$tweets <- removePunctuation(data$tweets)
  data$tweets <- removeWords(data$tweets, words = stopwords('en'))
  #data$tweets <- data$tweets[data$tweets != ""]
  data
}
# NOTE(review): Dir_ISarcasm is defined a few lines below in source order —
# confirm this chunk runs after those definitions when the document is knitted.
Dir = Dir_ISarcasm
train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')
train <- preprocessing(train)
# The coercion warning is expected here: `class` still holds string labels.
training_labels <- (train$class %>% as.array() %>% as.double())
## Warning in train$class %>% as.array() %>% as.double(): NAs introduced by
## coercion
# Dataset directories used throughout the rest of the document.
Dir_Main <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
Dir_ISarcasm <- 'Datasets/ISarcasm'
# List the CPU/GPU devices TensorFlow can see (sanity check for GPU training).
tensorflow::tf$python$client$device_lib$list_local_devices() %>% print()
## [[1]]
## name: "/device:CPU:0"
## device_type: "CPU"
## memory_limit: 268435456
## locality {
## }
## incarnation: 9840211226849380258
## xla_global_id: -1
##
##
## [[2]]
## name: "/device:GPU:0"
## device_type: "GPU"
## memory_limit: 5719982080
## locality {
## bus_id: 1
## links {
## }
## }
## incarnation: 7942840533864140915
## physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
## xla_global_id: 416903419
#-------------------------------------------------------------------
#Balance a binary-labelled data set by downsampling the majority class.
#
#data: data frame with a numeric `class` column (0 = regular, 1 = sarcasm).
#Returns a shuffled data frame containing equal counts of both classes.
even_out_observations <- function(data){
  regular <- data[which(data$class == 0), ]
  sarcasm <- data[which(data$class == 1), ]
  #Bug fix: the old code always truncated `sarcasm` to nrow(regular), which
  #produced rows full of NA whenever sarcasm was the minority class. Take
  #the smaller of the two counts from each class instead.
  n <- min(nrow(regular), nrow(sarcasm))
  data <- rbind(regular[seq_len(n), ], sarcasm[seq_len(n), ])
  #Shuffle so the two classes are interleaved.
  data <- data[sample(seq_len(nrow(data))), ]
  data
}
#-------------------------------------------------------------------
#Retrieve and preprocess the ISarcasm train/validation/test splits.
#
#Dir:    directory containing train.csv and test.csv.
#binary: TRUE collapses figurative/irony into the sarcasm class (label 1);
#        FALSE keeps four numeric classes (0-3).
#Returns a list of the three data frames plus numeric label vectors.
retrieve_dataset_ISarcasm <- function(Dir = Dir_ISarcasm, binary = FALSE) {
  train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')# %>% rename(tweets = tweet, class = sarcastic)
  test <- read.csv(paste0(Dir,"/test.csv"),fileEncoding = 'utf-8') #%>% rename(tweets = tweet, class = sarcastic)
  #Strip mentions, lowercase, remove punctuation and English stopwords.
  preprocessing <- function(data) {
    require('tm')
    #Bug fix: the old greedy pattern "@.* | @.*$" removed everything between
    #the first "@" and the last space; remove each mention token instead.
    data$tweets <- gsub("@\\S+", "", data$tweets)
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
  train <- preprocessing(train)
  test <- preprocessing(test)
  #Map the string labels onto numeric codes.
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  #Bug fix: slice the validation rows out BEFORE overwriting `train`; the old
  #order took validation from the already-subset train, so the validation set
  #overlapped the training set.
  validation <- train[-index,]
  train <- train[index,]
  training_labels <- (train$class %>% as.array() %>% as.double()) #normalize
  validation_labels <- (validation$class %>% as.array() %>% as.double()) #normalize
  test_labels <- (test$class %>% as.array() %>% as.double())
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#-------------------------------------------------------------------
#load('Datasets/Tweets_with_Sarcasm_and_Irony/test_w_hashtags.RData')
#Retrieve and preprocess the Tweets with Sarcasm and Irony splits.
#
#Dir:              dataset directory.
#binary:           TRUE collapses figurative/irony/sarcasm into label 1.
#even_out:         TRUE (with binary) downsamples so both classes are equal.
#without_hashtags: TRUE loads the hashtag-stripped CSVs instead.
#Returns a list of train/validation/test frames plus numeric label vectors.
retrieve_dataset <- function(Dir = 'Datasets/Tweets_with_Sarcasm_and_Irony', binary = FALSE, even_out = FALSE, without_hashtags = FALSE) {
  if(!without_hashtags){
    train <- read_csv(paste0(Dir,"/train.csv"))
    test <- read_csv(paste0(Dir,"/test.csv"))
  } else {
    train <- read_csv(paste0(Dir,"/train_without_hashtags.csv"))
    test <- read_csv(paste0(Dir,"/test_without_hashtags.csv"))
  }
  #Lowercase and strip punctuation/stopwords.
  preprocessing <- function(data) {
    require('tm')
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    data
  }
  train <- preprocessing(train)
  test <- preprocessing(test)
  test <- test %>% filter(!is.na(class))
  #Map the string labels onto numeric codes.
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  #Bug fix: take the validation rows from the original frame BEFORE
  #overwriting `train`; the old order drew validation out of the already
  #subset train, so the validation set overlapped the training set.
  validation <- train[-index,]
  train <- train[index,]
  if(even_out && binary){
    train <- even_out_observations(train)
    test <- even_out_observations(test)
    validation <- even_out_observations(validation)
  }
  training_labels <- (train$class %>% as.array() %>% as.double()) #normalize
  validation_labels <- (validation$class %>% as.array() %>% as.double()) #normalize
  test_labels <- (test$class %>% as.array() %>% as.double())
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#--------------------------------------------------------------------------------
#Turn the tweet text of the train/validation/test splits into padded integer
#sequences suitable for an embedding layer.
#
#maxlen:    every sequence is padded/truncated to this length.
#max_words: vocabulary cap when a new tokenizer has to be fitted.
#tokenizer: reuse an existing tokenizer when supplied; otherwise one is
#           fitted on the training text.
#Returns a list: train/validation/test sequence matrices plus the tokenizer.
generate_sequences <- function(train_data,
                               validation_data,
                               testing_data,
                               maxlen = 50,
                               max_words = 2000,
                               tokenizer = NULL)
{
  if (is.null(tokenizer)) {
    #Fit a fresh tokenizer on the training text only.
    tokenizer <- text_tokenizer(num_words = max_words) %>%
      fit_text_tokenizer(train_data$tweets %>% as.array())
    print('creating Tokenizer.....')
  } else {
    print('found tokenizer!')
  }
  #Helper: text -> integer sequences -> fixed-length padded matrix.
  #(tokenizer$word_index maps each integer back to its word.)
  encode <- function(txt) {
    texts_to_sequences(tokenizer, txt %>% as.array()) %>%
      pad_sequences(maxlen = maxlen)
  }
  list(train = encode(train_data$tweets),
       validation = encode(validation_data$tweets),
       test = encode(testing_data$tweets),
       tokenizer = tokenizer
  )
}
#-------------------------------------------------------------------------------------------------------------------
#Build a confusion-matrix summary (TN/FP/FN/TP counts) for binary labels.
#
#Labels:  true classes, 0 or 1.
#Guesses: predicted classes, 0 or 1.
#Returns a data.frame with `group` (cell name) and `value` (count) in the
#fixed order TN, FP, FN, TP — FP_Pie_Chart() indexes rows by position.
Accuracy_Label_Table <- function (Labels, Guesses) {
  #Encode each (label, guess) pair as 0..3 via 2*label + guess. This replaces
  #the old per-element strtoi(paste0(Label * 10 + Guess), base = 2) trick,
  #which computed the same code one pair at a time through purrr::map2.
  bin <- 2 * Labels + Guesses
  #Counts of codes 0..3, i.e. TN, FP, FN, TP respectively.
  value <- tabulate(bin + 1, nbins = 4)
  group = c("True Negative (TN)", #Label = 0, Guess = 0
            "False Positive (FP)", #Label = 0, Guess = 1
            "False Negative (FN)", #Label = 1, Guess = 0
            "True Positive (TP)" #Label = 1, Guess = 1
  )
  data.frame(group = group,
             value = value)
}
#-------------------------------------------------------------------------------
#Pie chart of the confusion-matrix cells, with per-class and total accuracy
#annotated near the top-left corner of the drawing.
#
#Labels / Guesses: binary (0/1) truth and prediction vectors.
FP_Pie_Chart <- function(Labels, Guesses) {
  cm <- Accuracy_Label_Table(Labels = Labels,
                             Guesses = Guesses)
  #Row order in cm is TN, FP, FN, TP (see Accuracy_Label_Table).
  tn <- cm[1, 2]
  fp <- cm[2, 2]
  fn <- cm[3, 2]
  tp <- cm[4, 2]
  N_Acc <- round(tn / (tn + fn), digits = 4)
  P_Acc <- round(tp / (tp + fp), digits = 4)
  Acc <- round((tn + tp) / (tn + fn + tp + fp), digits = 4)
  pie <- ggplot(cm, aes(x = "", y = value, fill = group)) +
    geom_col() +
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) +
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()
  #Overlay the accuracy figures on the drawn plot.
  ggdraw(pie) +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)), xmin = .8, xmax = .8, ymin = .975, ymax = .975) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)), xmin = .8, xmax = .8, ymin = .925, ymax = .925) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)), xmin = .8, xmax = .8, ymin = .875, ymax = .875)
}
#-------------------------------------------------------------------------------------------
#One-hot (multi-hot) encode train/validation/test text.
#
#train/validation/test: character vectors of documents.
#max_words: vocabulary cap when a tokenizer has to be fitted.
#tokenizer: optional pre-fitted tokenizer; when NA/NULL a new one is fitted
#           on the training text.
#Returns binary matrices (entry [i, j] = 1 iff word j occurs in doc i) plus
#the tokenizer. The `valdiation` key keeps its historical spelling so
#existing callers do not break.
one_hot_encode <- function(train,validation,test, max_words, tokenizer = NA) {
  training_text <- train %>% as.array()
  validation_text <- validation %>% as.array()
  testing_text <- test %>% as.array()
  #Bug fix: the old guard was inverted (`!is.na(tokenizer)`), so with the
  #default tokenizer = NA no tokenizer was ever fitted and texts_to_matrix
  #was handed NA. Fit one exactly when the caller did not supply one.
  if (is.null(tokenizer) || (is.atomic(tokenizer) && all(is.na(tokenizer)))) {
    tokenizer <- text_tokenizer(num_words = max_words) %>%
      fit_text_tokenizer(training_text)
  }
  train_one_hot_matrix <- texts_to_matrix(tokenizer, training_text, mode = "binary")
  #word_index <- tokenizer$word_index #The dictionary to translate a sequence to a sentence
  validation_one_hot_matrix <- texts_to_matrix(tokenizer, validation_text, mode = "binary")
  test_one_hot_matrix <- texts_to_matrix(tokenizer, testing_text, mode = "binary")
  list(train = train_one_hot_matrix,
       valdiation = validation_one_hot_matrix,
       test = test_one_hot_matrix,
       tokenizer = tokenizer)
}
# Hyperparameters for the bidirectional-LSTM sarcasm model.
max_words = 1000
embedding_dim = 8
maxlen = 50
# Binary ISarcasm splits (sarcasm vs. not sarcasm).
sets <- retrieve_dataset_ISarcasm(binary = TRUE
)
train <- sets$train_set
training_labels <- sets$train_labels
validation <- sets$validation_set
validation_labels <- sets$validation_labels
test <- sets$test_set
test_labels <- sets$test_labels
# Tokenize and pad all three splits with a tokenizer fit on the training text.
sequences <- generate_sequences(train,
validation,
test,
maxlen = maxlen,
max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test
# Embedding -> bidirectional LSTM -> LSTM -> sigmoid binary classifier.
model <- keras_model_sequential() %>%
layer_embedding(input_dim = max_words,
output_dim = embedding_dim,
input_length = maxlen) %>%
bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
layer_lstm(units = 64, return_sequences = FALSE) %>%
layer_flatten() %>%
layer_dense(units = 1,
activation = "sigmoid")
model %>% compile(
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = "accuracy"
)
# Train with validation monitoring, then report loss/accuracy on the test set.
history <- model %>% fit(
training_sequences,
training_labels,
epochs = 17,
batch_size = 128,
validation_data= list(validation_sequences,validation_labels)
)
results <- model %>% evaluate(test_sequences,test_labels)
results
#Split a data frame into train or test rows by random shuffle.
#
#data:  data frame to split.
#size:  proportion of rows assigned to the training set.
#train: TRUE returns the training rows, FALSE the remaining test rows.
#NOTE(review): each call reshuffles independently, so a train call and a
#test call do NOT partition the same shuffle — unlike the seeded index-based
#version defined later in this file. Confirm this is intended.
create_train_test <- function(data, size = 0.8, train = TRUE) {
  #Shuffle the rows.
  data <- data[sample(seq_len(nrow(data))), ]
  #floor() mirrors the old `1:total_row` behaviour for fractional counts.
  n_train <- floor(size * nrow(data))
  train_sample <- seq_len(n_train)
  if (train) {
    data[train_sample, ]
  } else {
    data[-train_sample, ]
  }
}
# Load the hashtag-stripped splits for the decision-tree model.
test <- read_csv("Datasets/Tweets_with_Sarcasm_and_Irony/test_without_hashtags.csv")
## New names:
## Rows: 8128 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train_without_hashtags.csv")
# Keep only the label and text columns (drops the `...1` index column).
test <- select(test, class, tweets)
train <- select(train, class,tweets)
#Run these two lines if you are only using sarcasm and regular in the model
train[train == "figurative"] <- "sarcasm"
train[train == "irony"] <- "sarcasm"
# Standard tm pipeline: lowercase, strip punctuation and stopwords, then stem.
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
## Warning in tm_map.SimpleCorpus(corpus, PlainTextDocument): transformation drops
## documents
corpus <- tm_map(corpus,tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents
corpus <- tm_map(corpus,removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus,removeWords,stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus,stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
# Document-term matrix; terms absent from more than 99.5% of docs are removed.
freq <- DocumentTermMatrix(corpus)
#Remove rare words
sparse <- removeSparseTerms(freq,.995)
#Create a matrix of the words
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class
#Split the data
# NOTE(review): create_train_test reshuffles on every call here, so testSet
# rows can overlap trainSet rows — confirm this is intended.
trainSet <- create_train_test(tSparse,.8,train = T)
testSet <- create_train_test(tSparse,.8,train = F)
#Check the proportion of each class
prop.table(table(tSparse$class))
##
## regular sarcasm
## 0.2284174 0.7715826
#Make the Model
#This is where you tweak the parameters to get the tree you want. This is also where the pruning is done.
# minsplit / cp / maxsurrogate control tree growth and pruning.
fit <- rpart(class~., data = trainSet,
method = "class",
minsplit = 28,
cp = .00045,
maxsurrogate = 1)
#Shows the significance of the words it is using in the model
printcp(fit)
##
## Classification tree:
## rpart(formula = class ~ ., data = trainSet, method = "class",
## minsplit = 28, cp = 0.00045, maxsurrogate = 1)
##
## Variables actually used in tree construction:
## [1] cant drug get ironi just late like love money
## [10] peopl polit run sarcasm thank that
##
## Root node error: 14970/65126 = 0.22986
##
## n= 65126
##
## CP nsplit rel error xerror xstd
## 1 0.01249165 0 1.00000 1.00000 0.0071725
## 2 0.00096192 1 0.98751 0.98751 0.0071409
## 3 0.00055667 6 0.98270 0.98283 0.0071289
## 4 0.00046760 13 0.97876 0.98243 0.0071279
## 5 0.00045000 15 0.97782 0.98069 0.0071234
# Visualize the fitted tree.
rpart.plot(fit)
#Making the prediction
prediction <- predict(fit, testSet, type = "class")
#prediction
#Testing the prediction against the training set
table_mat <- table(testSet$class, prediction)
# Overall accuracy: correct predictions over all predictions.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.7765631
# XGBoost on the ISarcasm data.
library('xgboost')
library('tm')
library('pacman')
library('tidyverse')
test <- read.csv("Datasets/ISarcasm/test.csv")
train <- read.csv("Datasets/ISarcasm/train.csv")
# Drop unlabeled rows and collapse the labels to binary sarcasm/regular.
train <- train %>% filter(class!="")
train$class[train$class == "figurative"] = "sarcasm"
train$class[train$class == "irony"] = "sarcasm"
# Deterministically split row indices into train/test partitions.
#
# data_size: total number of rows.
# size:      proportion assigned to the training set.
# train:     TRUE -> training indices, FALSE -> the complementary test indices.
# The seed is fixed INSIDE the function on purpose: a train call and a test
# call then shuffle identically, so the two index sets partition 1:data_size.
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  set.seed(123) # Set seed for reproducibility
  # Shuffle all row indices once, deterministically.
  shuffled <- sample(1:data_size)
  # Number of indices that belong to the training partition.
  n_train <- round(size * data_size)
  if (train) {
    shuffled[1:n_train]
  } else {
    shuffled[(n_train + 1):data_size]
  }
}
#CSC Project
#pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
# httr, lubridate, plotly, rio, rmarkdown, shiny,
# stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
# Same tm pipeline as the decision-tree section: lowercase, strip
# punctuation/stopwords, stem, then build a sparse document-term matrix.
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class
#Prepare data for XGBoost
# The fixed seed inside create_train_test makes these two index sets
# complementary (they partition the rows between them).
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)
# Encode the string labels as 0-based integers for xgboost.
tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1
# Feature matrices exclude the last column (`class`, appended above).
trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])
#Set up XGBoost parameters
# NOTE(review): this binary:logistic params list is dead code — it is
# overwritten by the multi:softprob list below before ever being used.
params <- list(
objective = "binary:logistic",
eval_metric = "error"
)
#Train the XGBoost model
unique(tSparse$class)
## [1] 1 0
num_classes <- length(unique(tSparse$class))
# Multiclass softmax-probability objective (works for the 2-class case too).
params <- list(
objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = num_classes,
eta = 0.3,
max_depth = 6,
min_child_weight = 1,
subsample = 1,
colsample_bytree = 1,
gamma = 0
)
# Train with early stopping on the held-out test matrix.
fit <- xgb.train(params,
data = trainSet,
nrounds = 100,
watchlist = list(test = testSet),
early_stopping_rounds = 10,
maximize = FALSE,
print_every_n = 10
)
## [1] test-mlogloss:0.437795
## Will train until test_mlogloss hasn't improved in 10 rounds.
##
## [11] test-mlogloss:0.016941
## [21] test-mlogloss:0.001089
## [31] test-mlogloss:0.000385
## [41] test-mlogloss:0.000329
## [51] test-mlogloss:0.000329
## Stopping. Best iteration:
## [43] test-mlogloss:0.000329
#Predict the test set
#prediction_prob <- predict(fit, testSet, output_margin = TRUE)
#prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
#prediction <- max.col(prediction) - 1
# Predict the probabilities of each class for the test set
# (output_margin = TRUE actually returns raw margins, not probabilities, but
# the argmax below is the same either way since softmax is monotonic.)
predicted_probs <- predict(fit, testSet, output_margin = TRUE)
# Reshape the vector of probabilities into a matrix
predicted_probs_matrix <- matrix(predicted_probs, ncol = num_classes, byrow = TRUE)
# Find the class with the maximum probability for each observation
predicted_class_indices <- apply(predicted_probs_matrix, 1, which.max) - 1
true_labels <- tSparse$class[test_indices]
# NOTE(review): perfect accuracy (1.0) is suspicious for text classification —
# verify there is no label leakage into the feature matrix before trusting it.
mean(true_labels == predicted_class_indices)
## [1] 1
#Build a confusion-matrix summary (TN/FP/FN/TP counts) for binary labels.
#
#Labels:  true classes, 0 or 1.
#Guesses: predicted classes, 0 or 1.
#Returns a data.frame with `group` (cell name) and `value` (count) in the
#fixed order TN, FP, FN, TP — FP_Pie_Chart() indexes rows by position.
Accuracy_Label_Table <- function (Labels, Guesses) {
  #Encode each (label, guess) pair as 0..3 via 2*label + guess. This replaces
  #the old per-element strtoi(paste0(Label * 10 + Guess), base = 2) trick,
  #which computed the same code one pair at a time through purrr::map2.
  bin <- 2 * Labels + Guesses
  #Counts of codes 0..3, i.e. TN, FP, FN, TP respectively.
  value <- tabulate(bin + 1, nbins = 4)
  group = c("True Negative (TN)", #Label = 0, Guess = 0
            "False Positive (FP)", #Label = 0, Guess = 1
            "False Negative (FN)", #Label = 1, Guess = 0
            "True Positive (TP)" #Label = 1, Guess = 1
  )
  data.frame(group = group,
             value = value)
}
#-------------------------------------------------------------------------------
#Pie chart of the confusion-matrix cells, with per-class and total accuracy
#annotated near the top-left corner of the drawing.
#
#Labels / Guesses: binary (0/1) truth and prediction vectors.
FP_Pie_Chart <- function(Labels, Guesses) {
  require('cowplot')
  require('ggrepel')
  require('grid')
  cm <- Accuracy_Label_Table(Labels = Labels,
                             Guesses = Guesses)
  #Row order in cm is TN, FP, FN, TP (see Accuracy_Label_Table).
  tn <- cm[1, 2]
  fp <- cm[2, 2]
  fn <- cm[3, 2]
  tp <- cm[4, 2]
  N_Acc <- round(tn / (tn + fn), digits = 4)
  P_Acc <- round(tp / (tp + fp), digits = 4)
  Acc <- round((tn + tp) / (tn + fn + tp + fp), digits = 4)
  pie <- ggplot(cm, aes(x = "", y = value, fill = group)) +
    geom_col() +
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) +
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()
  #Overlay the accuracy figures on the drawn plot.
  ggdraw(pie) +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)), xmin = .8, xmax = .8, ymin = .975, ymax = .975) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)), xmin = .8, xmax = .8, ymin = .925, ymax = .925) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)), xmin = .8, xmax = .8, ymin = .875, ymax = .875)
}
# Visualize the confusion cells and accuracies for the XGBoost predictions.
FP_Pie_Chart(Labels = true_labels, Guesses = predicted_class_indices)